{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ffb79eb1",
   "metadata": {},
   "source": [
    "Code for YouTube data cleaning and sampling. \"Priority 1\" refers to data already filtered to not include non-english, TV clip, or music video channels (but keeping Kids); final samples here have 51 creators, but two were subsequently removed (to make 49), given that they were Kids channels whose label, as such, had not been registered (due to two corrected typos)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7944af9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Loads list of labeled channels\n",
    "list_of_labelled_channels = pd.read_excel('Youtube Most Viewed Channels Metadata.xlsx', sheet_name='channel_groupby_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a57f727",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load in YT data at different \"priorities\" (based on channel tags for language and content type)\n",
    "channelsp1 = pd.read_csv('priority_list_1.csv')\n",
    "\n",
    "channelsp23 = pd.read_csv('priority_list_2-3.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d883407a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#identify channels that could not be collected\n",
    "\n",
    "p1channelids_set = set(p1channelids)\n",
    "p23channelids_set = set(p23channelids)\n",
    "\n",
    "combined_p1_p23_channelids = p1channelids_set.union(p23channelids_set)\n",
    "\n",
    "labelled_channel_ids_set = set(labeled_channel_ids)\n",
    "\n",
    "missing_channel_ids = list(labelled_channel_ids_set.difference(combined_p1_p23_channelids))\n",
    "\n",
    "missing_channels = list_of_labelled_channels[list_of_labelled_channels['Channel_id'].isin(missing_channel_ids)]\n",
    "\n",
    "print(missing_channels.head())\n",
    "print(missing_channels.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3a92d05",
   "metadata": {},
   "outputs": [],
   "source": [
    "#clean up and merge dfs of data\n",
    "\n",
    "channelsp1 = pd.read_csv('priority_list_1.csv')\n",
    "channelsp23 = pd.read_csv('priority_list_2-3.csv')\n",
    "\n",
    "def combine_columns(df):\n",
    "    df['publishedAt_combined'] = df['publishedAT'].fillna(df['publishedAt'])\n",
    "    return df\n",
    "\n",
    "# Apply the function to both DataFrames\n",
    "channelsp1_combined = combine_columns(channelsp1)\n",
    "channelsp23_combined = combine_columns(channelsp23)\n",
    "\n",
    "# Optionally, you might want to drop the original columns to clean up the DataFrame\n",
    "channelsp1_combined.drop(['publishedAT', 'publishedAt'], axis=1, inplace=True)\n",
    "channelsp23_combined.drop(['publishedAT', 'publishedAt'], axis=1, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e7fe78b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#correlate with channel list,labeled; add column for years on top list\n",
    "\n",
    "list_of_labelled_channels.rename(columns={'Channel_id': 'channel_id'}, inplace=True)\n",
    "\n",
    "channelsp1_withchanneltags = pd.merge(channelsp1_combined, \n",
    "                                      list_of_labelled_channels, \n",
    "                                      on='channel_id', \n",
    "                                      how='left')\n",
    "\n",
    "channelsp23_withchanneltags = pd.merge(channelsp23_combined, \n",
    "                                       list_of_labelled_channels, \n",
    "                                       on='channel_id', \n",
    "                                       how='left')\n",
    "\n",
    "channelsp1_withchanneltags['Years On Top Lists'] = channelsp1_withchanneltags['channel_id'].map(aggregated_data)\n",
    "channelsp23_withchanneltags['Years On Top Lists'] = channelsp23_withchanneltags['channel_id'].map(aggregated_data)\n",
    "\n",
    "channelsp1_withchanneltagsandyears = channelsp1_withchanneltags\n",
    "channelsp23_withchanneltagsandyears = channelsp23_withchanneltags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66b33004",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the two DataFrames into one\n",
    "allchannelsinfo = pd.concat([channelsp1_withchanneltagsandyears, channelsp23_withchanneltagsandyears])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b4b2f20",
   "metadata": {},
   "outputs": [],
   "source": [
    "#after all channel data renamed to df; filter by years on pub list\n",
    "\n",
    "df['Published Year'] = df['publishedAt_combined'].dropna().apply(lambda x: x[:4] if pd.notnull(x) else None)\n",
    "\n",
    "df['Years On Top Lists'] = df['Years On Top Lists'].apply(lambda x: [int(year) for year in ast.literal_eval(x)] if pd.notnull(x) else x)\n",
    "\n",
    "df.dropna(subset=['Published Year'], inplace=True)\n",
    "df['Published Year'] = df['Published Year'].astype(int)\n",
    "df_filtered = df[df.apply(lambda row: row['Published Year'] in row['Years On Top Lists'], axis=1)]\n",
    "\n",
    "df_filtered.to_csv(\"allchannelsfullvideodataonlyyearsontoplists.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6314cccd",
   "metadata": {},
   "outputs": [],
   "source": [
    "#inpsect # videos per channel and viewcounts \n",
    "\n",
    "channel_id_counts = df_filtered['channel_id'].value_counts()\n",
    "\n",
    "smallest_counts = channel_id_counts.nsmallest(40)\n",
    "largest_counts = channel_id_counts.nlargest(40)\n",
    "\n",
    "combined_counts = pd.concat([smallest_counts, largest_counts])\n",
    "\n",
    "channel_info = df_filtered[['channel_id', 'channel_title']].drop_duplicates().set_index('channel_id')\n",
    "channel_info['Number of Rows'] = channel_id_counts\n",
    "\n",
    "final_info = channel_info.loc[combined_counts.index].reset_index()\n",
    "final_info['Number of Rows'] = final_info['channel_id'].map(channel_id_counts)\n",
    "\n",
    "print(final_info[['channel_id', 'channel_title', 'Number of Rows']])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d2d1263",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "df_filtered['viewCount'] = pd.to_numeric(df_filtered['viewCount'], errors='coerce')\n",
    "\n",
    "df_sorted_by_viewCount = df_filtered.sort_values(by='viewCount')\n",
    "\n",
    "top_50_by_viewCount = df_sorted_by_viewCount.tail(50)\n",
    "\n",
    "bottom_50_by_viewCount = df_sorted_by_viewCount.head(50)\n",
    "\n",
    "combined_top_bottom_viewCount = pd.concat([bottom_50_by_viewCount, top_50_by_viewCount])\n",
    "\n",
    "for index, row in combined_top_bottom_viewCount.iterrows():\n",
    "    print(row[['channel_title', 'title','viewCount']])  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f2dfa89",
   "metadata": {},
   "outputs": [],
   "source": [
    "# filter by viewcount\n",
    "df_filtered['viewCount'] = pd.to_numeric(df_filtered['viewCount'], errors='coerce')\n",
    "\n",
    "df_filtered_sorted = df_filtered.sort_values(by='viewCount', ascending=False, na_position='last')\n",
    "\n",
    "df_filtered_sorted_500kplus = df_filtered_sorted[df_filtered_sorted['viewCount'] >= 500000]\n",
    "\n",
    "df_filtered_sorted_500kplus.to_csv(\"allvideostoplists500kplusviews.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c48910b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "priority1_500kplus = df_filtered_sorted_500kplus[df_filtered_sorted_500kplus['Priority'] == 1]\n",
    "\n",
    "priority1_500kplus.to_csv(\"priority1videos_500kplusviews_onlytoplistyears.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4824c77",
   "metadata": {},
   "outputs": [],
   "source": [
    "#cut kids\n",
    "\n",
    "kids_rows = priority1_500kplus[priority1_500kplus['Note'].str.contains('Kids', na=False)]\n",
    "\n",
    "print(f\"Number of rows with 'Kids' in the Note: {len(kids_rows)}\")\n",
    "\n",
    "print(kids_rows.sample(n=min(5, len(kids_rows)), random_state=42))\n",
    "\n",
    "priority1_500kplus_nokids = priority1_500kplus[~priority1_500kplus['Note'].str.contains(\"Kids\", na=False)]\n",
    "\n",
    "priority1_500kplus_nokids.to_csv(\"priority1videos_500kplusshares_nokids.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c818065a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#make samples\n",
    "#after creators with less than 10 videos have also been manually deleted\n",
    "#run twice, order scrambled\n",
    "\n",
    "file_name = 'priority1videos_500kplusshares_nokids.csv'\n",
    "df = pd.read_csv(file_name)\n",
    "\n",
    "print(df.head())\n",
    "\n",
    "# Sample 10 videos per creator\n",
    "def sample_videos(df, videos_per_creator=10):\n",
    "    sampled_df = df.groupby('channel_id').sample(n=videos_per_creator, random_state=1)\n",
    "    return sampled_df\n",
    "\n",
    "sampled_df = sample_videos(filtered_df)\n",
    "\n",
    "videos_per_year = sampled_df['Published Year'].value_counts().sort_index()\n",
    "\n",
    "print(\"Number of videos per year in the sampled dataset:\")\n",
    "print(videos_per_year)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6056a441",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
